#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Install the third-party packages once from a shell (or with `%pip install ...`
# in a notebook cell). A bare `pip install ...` line is not Python syntax, so
# leaving it in the exported script raises a SyntaxError before anything runs.
#   pip install nltk


# In[2]:


import nltk


# In[3]:


# gensim is not referenced by any snippet visible in this file.
#   pip install gensim


# In[4]:


# pattern is not referenced by any snippet visible in this file.
#   pip install pattern


# In[5]:


# NLTK ships its corpora separately from the library, so they have to be fetched
# before first use. Calling nltk.download() with no argument opens the
# interactive downloader, which stalls a non-interactive run; ask for the one
# resource the snippets in this file load (the stop-word lists) by name instead.
nltk.download('stopwords')


# In[6]:


# Code Snippet 1 - Working with NLTK


# In[7]:


# Noun-phrase (NP) chunking: group the words of a tagged sentence into noun
# phrases using a grammar we define ourselves.
import nltk

# The sentence is supplied already tagged, as (word, part-of-speech tag) pairs.
# DT -> determiner, JJ -> adjective, NN -> noun, VBP -> verb, IN -> preposition
test_phrases = [
    ("an", "DT"),
    ("astonishing", "JJ"),
    ("leopard", "NN"),
    ("is", "VBP"),
    ("running", "VBP"),
    ("around", "IN"),
    ("the", "DT"),
    ("ground", "NN"),
]

# The grammar is a regular expression over tags: an NP is an optional
# determiner, followed by any number of adjectives, followed by a noun.
def_grammar = "NP:{<DT>?<JJ>*<NN>}"
parse_chunk_tests = nltk.RegexpParser(def_grammar)

# A bare expression is only echoed inside a notebook; bind the resulting tree
# and print it so the chunks are also shown when this runs as a script.
chunk_tree = parse_chunk_tests.parse(test_phrases)
print(chunk_tree)


# In[8]:


# Code Snippet 2 - Web Crawling and NLP


# In[10]:


# Fetch a web page so the following steps can work out what the page is about.
import nltk

# urllib from the standard library is enough to download a single page.
import urllib.request

# The `with` block closes the HTTP response once the body has been read, and
# the timeout keeps a stalled connection from hanging the run indefinitely.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Mercedes-Benz', timeout=30) as test_resp:
    input_page = test_resp.read()

# input_page holds the raw, undecoded bytes of the HTML document.
print(input_page)


# In[11]:


# input_page is the raw HTML that was downloaded during crawling.
# Beautiful Soup (a Python library) parses HTML/XML and lets us pull out just
# the human-readable text. The 'html5lib' parser is used here, so both the
# beautifulsoup4 and html5lib packages need to be installed.
from bs4 import BeautifulSoup
soup_type = BeautifulSoup(input_page, 'html5lib')

# strip=True trims whitespace around every text fragment. Without an explicit
# separator the trimmed fragments are joined directly, gluing together words
# from neighbouring elements; a single space keeps them apart so the text can
# be tokenized on whitespace afterwards.
input_text = soup_type.get_text(separator=' ', strip=True)
print(input_text)


# In[12]:


# input_text is the cleaned-up page text produced by BeautifulSoup.
# Turn it into a list of tokens for the frequency analysis: str.split() with
# no argument cuts the text on every run of whitespace.
build_tokens = input_text.split()
print(build_tokens)


# In[13]:


# build_tokens is the list of tokenized words from the page.
# Count how often each word occurs with NLTK's FreqDist, after dropping stop
# words (at, the, a, for, ...) that would dominate the counts without saying
# anything about the topic.
from nltk.corpus import stopwords
in_request = stopwords.words('english')

# Read the stop-word list once and keep it in a set: membership tests are then
# constant-time instead of reloading and scanning the list for every token.
stop_words = set(in_request)

# Filter in a single pass (token order is preserved). NLTK's English stop words
# are lowercase, so compare on the lowercased token; otherwise capitalized
# occurrences such as "The" would slip through.
clean_text = [token for token in build_tokens if token.lower() not in stop_words]

freq_dist = nltk.FreqDist(clean_text)
for k, v in freq_dist.items():
    print(f"{k}:{v}")

# Lastly, plot the 20 most frequent words to show at a glance which topic the
# page talks about most.
freq_dist.plot(20, cumulative=False)


# In[ ]:




